#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
loadlibs <- function(){
library(reshape)
library(pheatmap)
library(RColorBrewer)
library(DT)
library(ggrepel)
library(viridis)
#library(Repitools)
library(knitr)
library(DESeq2)
library(DiffBind)
library(ChIPpeakAnno)
library(ChIPseeker)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(AnnotationHub)
library(rtracklayer)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(tidyverse)
library(ChIPseeker)
library(clusterProfiler)
library(rtracklayer)
library(ChIPpeakAnno)
}
suppressPackageStartupMessages(loadlibs())
knitr::opts_chunk$set(
cache = FALSE,
dev = c("png", "pdf"),
error = TRUE,
highlight = TRUE,
message = FALSE,
prompt = FALSE,
tidy = FALSE,
warning = FALSE,
fig.align = 'center')
#getwd()
if (file.exists("./_setup.R")){
source("_setup.R")
}
# Set seed for reproducibility
set.seed(1454944673L)
viridis_col <- viridis_pal()(20)
## Load sample data
samples <- read.csv('../meta/Brg1_metadatasheet_NF.csv')
#head(samples)
# Load metadata
meta <- read.csv("../meta/Brg1_metadatasheet_NF_2.csv")
# Load counts
counts_con <- read.delim("../data/consensus/consensus-counts.tsv", row.names=1)
#meta
sanitize_datatable(meta, style='bootstrap')
# Read in data
counts <- read.delim("../meta/multiBAMsummary.tab", sep="\t")
# Remove genomic coordinate information
plot_counts <- data.frame(counts[, 4:ncol(counts)])
# Change column names
colnames(plot_counts) <- colnames(plot_counts) %>%
str_replace( "X.", "") %>%
str_replace(".NF.", "")
#all(meta$shortname %in% colnames(plot_counts))
plot_counts <- plot_counts[ ,meta$sample]
annotation <- as.data.frame(meta[,c("Phenotype", "Treatment")])
#all(meta$sample == colnames(plot_counts))
rownames(annotation) <- meta$sample
heat.colors <- brewer.pal(6, "YlOrRd")
#pheatmap(cor(plot_counts), color=heat.colors, annotation=annotation, fontsize = 8)
cat('\n\n')
## loading packages
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
NFnarrowPeak_tib <- meta
for (samplename in NFnarrowPeak_tib$SampleID) {
peaks_file <-read.delim(paste0("../data/peakcalls/",samplename,"_peaks.narrowPeak"), header =F)
df <- data.frame(peak_enrichment = peaks_file$V7, peak_rank = rank(dplyr::desc(peaks_file$V7))) %>%
dplyr::arrange(peak_rank)
assign(paste0(samplename,"_peak_file"), df)
}
for (sample_ID in NFnarrowPeak_tib$SampleID) {
obj <- ChIPpeakAnno::toGRanges(paste0("../data/peakcalls/",sample_ID,"_peaks.narrowPeak"), format="narrowPeak", header=FALSE)
assign(paste0(sample_ID,"_grange_file"), obj)
}
## check the genomic element distribution of the duplicates
## the genomic element distribution will indicates the
## the correlation between duplicates.
peaks <- GRangesList(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`)
oh4 <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S8_H460_KO-NF_grange_file`, connectedPeaks= "merge")
#oh4$peaklist[["S7_H460_WT.NF_grange_file///S8_H460_KO.NF_grange_file"]][1:2]
makeVennDiagram(oh4, fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color
## $p.value
## S7_H460_WT.NF_grange_file S8_H460_KO.NF_grange_file pval
## [1,] 1 1 0
##
## $vennCounts
## S7_H460_WT.NF_grange_file S8_H460_KO.NF_grange_file Counts
## [1,] 0 0 0
## [2,] 0 1 37778
## [3,] 1 0 84807
## [4,] 1 1 71237
## attr(,"class")
## [1] "VennCounts"
cat('\n\n')
Getting the unique peaks from WT and KO and looking for annotations.
Below plots shows the annotation distribution for Unique WT peaks n= 84807 Unique KO peaks n= 37778
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
#annoData[1:2]
peaks <- GRangesList(H460_unique_WT=oh4$peaklist$S7_H460_WT.NF_grange_file,
H460_unique_KO=oh4$peaklist$S8_H460_KO.NF_grange_file)
genomicElementDistribution(peaks,
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
promoterRegion=c(upstream=2000, downstream=500),
geneDownstream=c(upstream=0, downstream=2000))
#metagenePlot(peaks, TxDb.Hsapiens.UCSC.hg19.knownGene)
cat('\n\n')
peaks9 <- GRangesList(`S9_H2009_WT-NF_grange_file`,`S10_H2009_KO-NF_grange_file`)
oh9 <- findOverlapsOfPeaks(`S9_H2009_WT-NF_grange_file`,`S10_H2009_KO-NF_grange_file`, connectedPeaks= "merge")
#oh9$peaklist[["S9_H2009_WT.NF_grange_file///S10_H2009_KO.NF_grange_file"]][1:2]
makeVennDiagram(oh9, fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color
## $p.value
## S9_H2009_WT.NF_grange_file S10_H2009_KO.NF_grange_file pval
## [1,] 1 1 0
##
## $vennCounts
## S9_H2009_WT.NF_grange_file S10_H2009_KO.NF_grange_file Counts
## [1,] 0 0 0
## [2,] 0 1 68028
## [3,] 1 0 65486
## [4,] 1 1 102897
## attr(,"class")
## [1] "VennCounts"
H2009 sample seems to have similar peak calling however WT has more peaks than KO.
Below plots shows the annotation distribution for Unique WT peaks n= 65486 Unique KO peaks n= 37778
peaks9 <- GRangesList(H2009_unique_WT=oh9$peaklist$S9_H2009_WT.NF_grange_file,
H2009_unique_KO=oh9$peaklist$S10_H2009_KO.NF_grange_file)
genomicElementDistribution(peaks9,
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
promoterRegion=c(upstream=2000, downstream=500),
geneDownstream=c(upstream=0, downstream=2000))
cat('\n\n')
peaksC <- GRangesList(`S11_Calu6_WT-NF_grange_file`,`S12_Calu6_KO-NF_grange_file`)
ohC <- findOverlapsOfPeaks(`S11_Calu6_WT-NF_grange_file`,`S12_Calu6_KO-NF_grange_file`, connectedPeaks= "merge")
#ohC$peaklist[["S11_Calu6_WT-NF_grange_file///S12_Calu6_KO-NF_grange_file"]][1:2]
makeVennDiagram(ohC, fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color
## $p.value
## S11_Calu6_WT.NF_grange_file S12_Calu6_KO.NF_grange_file pval
## [1,] 1 1 0
##
## $vennCounts
## S11_Calu6_WT.NF_grange_file S12_Calu6_KO.NF_grange_file Counts
## [1,] 0 0 0
## [2,] 0 1 31687
## [3,] 1 0 126070
## [4,] 1 1 66548
## attr(,"class")
## [1] "VennCounts"
Similar pattern in Calu6 where WT has higher number of peaks called than KO.
Below plots shows the annotation distribution for Unique WT peaks n= 126070 Unique KO peaks n= 31687
peaksC <- GRangesList(Calu6_unique_WT=ohC$peaklist$S11_Calu6_WT.NF_grange_file,
Calu6_unique_KO=ohC$peaklist$S12_Calu6_KO.NF_grange_file)
genomicElementDistribution(peaksC,
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene,
promoterRegion=c(upstream=2000, downstream=500),
geneDownstream=c(upstream=0, downstream=2000))
cat('\n\n')
However peak distribution in Calu6 KO sample shows more Promter, and Distal Intergenic regions.
Peak Overlap Between WT samples I treated all three WT samples as replicates and attempted to call peaks after combining peaks from three samples.
#ol <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`, `S11_Calu6_WT-NF_grange_file`, connectedPeaks="keepAll")
# WT Brg1
olaps_wt <- findOverlapsOfPeaks(`S7_H460_WT-NF_grange_file`, `S9_H2009_WT-NF_grange_file`, `S11_Calu6_WT-NF_grange_file`, connectedPeaks="merge")
ven<-makeVennDiagram(olaps_wt, totalTest=3e+3, connectedPeaks = "merge",
fill = c("cornflowerblue", "orange", "brown1"), NameOfPeaks = c("H460_WT", "H2009_WT", "Calu6_WT"))
overlaps_wt <- olaps_wt$peaklist$`S7_H460_WT.NF_grange_file///S9_H2009_WT.NF_grange_file///S11_Calu6_WT.NF_grange_file`
## add metadata (mean of score) to the overlapping peaks
#ol <- addMetadata(ol, colNames="score", FUN=mean)
cat('\n\n')
Peak Overlap Between KO samples
#ok <- findOverlapsOfPeaks(`S8_H460_KO-NF_grange_file`, `S10_H2009_KO-NF_grange_file`, `S12_Calu6_KO-NF_grange_file`, connectedPeaks="keepAll")
# WT Brg1
olaps_ko <- findOverlapsOfPeaks(`S8_H460_KO-NF_grange_file`, `S10_H2009_KO-NF_grange_file`, `S12_Calu6_KO-NF_grange_file`, connectedPeaks="merge")
ven<-makeVennDiagram(olaps_ko, totalTest=3e+3, connectedPeaks = "merge",
fill = c("cornflowerblue", "orange", "brown1"), NameOfPeaks = c("H460_KO", "H2009_KO", "Calu6_KO"))
overlaps_ko <- olaps_ko$peaklist$`S8_H460_KO.NF_grange_file///S10_H2009_KO.NF_grange_file///S12_Calu6_KO.NF_grange_file`
cat('\n\n')
There are rlength(olaps_ko\(peaklist\)S8_H460_KO.NF_grange_file///S10_H2009_KO.NF_grange_file///S12_Calu6_KO.NF_grange_file)`
regions common in KO samples.
#WT_cons vs KO_cons
olaps_final <- findOverlapsOfPeaks(overlaps_wt,overlaps_ko,connectedPeaks="merge")
ven<-makeVennDiagram(olaps_final, connectedPeaks = "merge")
olaps_cons_wt <- olaps_final$peaklist$overlaps_wt
olaps_cons_ko <- olaps_final$peaklist$overlaps_ko
peaks_con_list <- GRangesList(WT=olaps_cons_wt, KO=olaps_cons_ko)
cat('\n\n')
It looks like there are 18995 unique in WT samples while 7630 unique to KO samples.
If no precedence specified, double count will be enabled, which means that if a peak overlap with both promoter and 5’UTR, both promoter and 5’UTR will be incremented. If a precedence order is specified, for example, if promoter is specified before 5’UTR, then only promoter will be incremented for the same example. The values could be any conbinations of “Promoters”, “immediateDownstream”, “fiveUTRs”, “threeUTRs”, “Exons” and “Introns”, Default=NULL
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(patchwork)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
#annoData
data(TSS.human.NCBI36)
##WT
annotatedPeak_wt <- annotatePeakInBatch(olaps_cons_wt,
AnnotationData=TSS.human.NCBI36)
#annotatedPeak_wt[1:3]
#pie1(table(as.data.frame(annotatedPeak_wt)$insideFeature))
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
aCR<-assignChromosomeRegion(annotatedPeak_wt, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage)
title(main = "WT plot", font = 4)
}
annotatedPeak_wt <- addGeneIDs(annotatedPeak_wt, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
#head(annotatedPeak_wt)
#head(overlaps.anno)
write.csv(as.data.frame(unname(annotatedPeak_wt)), "WT_consensus_anno.csv")
##KO
annotatedPeak_ko <- annotatePeakInBatch(olaps_cons_ko,
AnnotationData=TSS.human.NCBI36)
#annotatedPeak_wt[1:3]
#pie1(table(as.data.frame(annotatedPeak_ko)$insideFeature))
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
aCR<-assignChromosomeRegion(annotatedPeak_ko, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage)
title(main = "KO plot", font = 4)
}
annotatedPeak_ko <- addGeneIDs(annotatedPeak_ko, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
#head(annotatedPeak_ko)
write.csv(as.data.frame(unname(annotatedPeak_ko)), "KO_consensus_anno.csv")
The label missing the barplot is “immediateDownstream”, due to size it’s not able to fit in xlabel text window.
library(GO.db)
library(patchwork)
library(org.Hs.eg.db)
library( reactome.db )
wt <- getEnrichedPATH(annotatedPeak_wt, "org.Hs.eg.db", "reactome.db", maxP=.05)
wt <- enrichmentPlot(wt) + ggtitle('WT')
wt
#KO
ko <- getEnrichedPATH(annotatedPeak_ko, "org.Hs.eg.db", "reactome.db", maxP=.05)
ko <- enrichmentPlot(ko) + ggtitle('KO')
ko
cat('\n\n')
we need to see how samples might be segregating and see if the variability observed can be explained by any other covariates. Identifying the main sources of variation will allow us to better model the data, and also validate outlier samples.
# Load counts
samples <- read.csv("../meta/metadata_rm.csv")
dObj <- dba(sampleSheet=samples)
dObj <- dba.count(dObj)
info <- dba.show(dObj)
libsizes <- cbind(LibReads=info$Reads, FRiP=info$FRiP,PeakReads=round(info$Reads * info$FRiP))
rownames(libsizes) <- info$ID
#Normalizing the data
dObj <- dba.normalize(dObj)
norm <- dba.normalize(dObj, bRetrieve=TRUE)
normlibs <- cbind(FullLibSize=norm$lib.sizes, NormFacs=norm$norm.factors, NormLibSize=round(norm$lib.sizes/norm$norm.factors))
rownames(normlibs) <- info$ID
#Establishing a model design and contrast
dObj <- dba.contrast(dObj, categories=DBA_TREATMENT)
dba.plotPCA(dObj)
#Performing the differential analysis
#dObj <- dba.analyze(dObj)
dObj <- dba.analyze(dObj, method=DBA_ALL_METHODS)
#Retrieving the differentially bound sites
dObj.DB <- dba.report(dObj)
#print("no genes found with fold change more than 0")
#sum(dObj.DB$Fold>0)
#print("genes found to have fold change less than 0")
#sum(dObj.DB$Fold<0)
meta_H4 <- read.csv("../meta/metadata_rm.csv")
meta_H4 <- meta_H4 %>%
column_to_rownames(var="sample")
counts_H4 <- counts_con[,rownames(meta_H4)]
#all
dds_all <- DESeqDataSetFromMatrix(counts_H4, meta_H4, design=~cellline + Factor)
dds_all <- DESeq(dds_all)
cat('\n\n')
#meta
From the PCA we observe a samples from same celllines are close which associates with the sample groups.
# Matrix of transformed counts for downstream visualization
rld1 <- rlog(dds_all, blind = TRUE)
# Compute principal components
pc <- prcomp(t(assay(rld1)))
plot_pca <- data.frame(pc$x, colData(dds_all))
# Plot with sample names used as data points
ggplot(plot_pca, aes(x = PC1, y = PC2, label = rownames(plot_pca))) +
geom_point(aes(color = cellline,shape = Factor), size = 3) +
geom_text_repel() +
xlab('PC1') + ylab('PC2') +
theme_bw() +
theme(axis.title = element_text(size = rel(1.25)),
axis.text = element_text(size = rel(1.15)),
plot.title = element_text(size = rel(1.5), hjust = 0.5),
legend.title = element_blank()) +
scale_color_manual(values=viridis_col[c(1,20)])
cat('\n\n')
cat('\n\n')
We do not have replicates in this study, so we may not be able to get differential expressed genes from two conditions i.e. WT and KO for each cell line. However, since we have Brg1 KO and WT samples from three cell lines,i.e. H460, H2009 and Calu6 we may try finding DE genes bt treatind each condition sample as replicates ie. WT samples vs KO samples.
We often look at the dispersion plot to get a good idea of whether or not our data is a good fit for the model. Dispersion is a metric for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene and is used in the final GLM fit. From this plot we see that:
plotDispEsts(dds_all)
# filter
keep <- rowSums(counts(dds_all) >= 10) >= 3
dds_all <- dds_all[keep,]
dds_all <- DESeq(dds_all)
# plot again
plotDispEsts(dds_all)
cat('\n\n')
The MA plot explores the mean expression level of the genes with the fold change, highlighting the regions that are differentially enriched (padj < 0.05) using blue colored data points.
# Get results
res_all <- results(dds_all,contrast=c("Factor","WT","KO"))
resultsNames(dds_all)
## [1] "Intercept" "cellline_H460_vs_Calu6" "Factor_WT_vs_KO"
res_all <- lfcShrink(dds_all, coef="Factor_WT_vs_KO", type="apeglm")
plotMA(res_all, alpha = 0.5)
#res_all <- sort(res_all@listData[["padj"]])
cat('\n\n')
At a padj < 0.05, we find:
Here, we plot the log2 fold change of each region against the log10 adjusted p-value. The points highlighted in teal are regions that have padj < 0.05.
The contrast is setup such that positive logFC means that there is an increased enrichment of a region in WT compared to KO.
nrow(res_all[which(res_all$padj < 0.05),])
## [1] 284
# Create dataframe
res_all <- res_all %>%
as.data.frame() %>%
mutate(threshold = padj < 0.05) %>%
arrange(padj)
res_all_filt <- res_all %>%
as.data.frame() %>%
mutate(threshold = padj < 0.05) %>%
arrange(padj) %>% filter(threshold == "TRUE")
## Volcano plot
p1 <- ggplot(res_all) +
geom_point(aes(log2FoldChange, y = -log10(padj), colour = threshold)) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
ggtitle("WT vs KO") +
theme_bw() +
theme(
legend.position = "none",
plot.title = element_text(size = rel(2.0), hjust = 0.5),
axis.title = element_text(size = rel(1.25))
)
p1
cat('\n\n')
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene #shorthand (for convenience)
# Make TxDb
#txdb
# Create GRanges
gr_all <- data.frame(rows = rownames(res_all_filt)) %>%
separate(rows, c("chr", "start", "end")) %>%
cbind(res_all_filt) %>%
remove_rownames() %>%
makeGRangesFromDataFrame(keep.extra.columns=TRUE,
ignore.strand=TRUE,
seqnames.field=c("chr"),
start.field="start",
end.field=c("end"))
# Annotate
WT_KO <- annotatePeak(gr_all, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db")
## >> preparing features information... 2022-04-12 21:14:33
## >> identifying nearest features... 2022-04-12 21:14:34
## >> calculating distance from peak to TSS... 2022-04-12 21:14:34
## >> assigning genomic annotation... 2022-04-12 21:14:34
## >> adding gene annotation... 2022-04-12 21:14:45
## >> assigning chromosome lengths 2022-04-12 21:14:45
## >> done... 2022-04-12 21:14:45
plotAnnoPie(WT_KO)
#plotAnnoBar(WT_KO)
#upsetplot(WT_KO)
#vennpie(WT_KO)
library(ggupset)
library(ggimage)
#upsetplot(WT_KO, vennpie=TRUE)
# Query AnnotationHub to get gene symbols
ah <- AnnotationHub()
# Query AnnotationHub
human_ens <- query(ah, c("Homo sapiens", "EnsDb"))
# Extract annotations of interest
human_ens <- human_ens[["AH75011"]]
# Extract gene-level information
#genes(human_ens, return.type = "data.frame") %>% head()
# Create a gene-level dataframe
annotations_ahb <- genes(human_ens, return.type = "data.frame") %>%
dplyr::select(gene_name,gene_id, entrezid, gene_biotype,symbol)
# Keep one entrez Id per gene
#annotations_ahb$entrezid <- map(annotations_ahb$entrezid,1) %>% unlist()
# Combine annotations with results
WT_KO_all <- WT_KO@anno %>%
data.frame() %>%
dplyr::select(-geneChr, -geneStart, -geneEnd, -geneLength, -geneStrand) %>%
left_join(annotations_ahb, by=c("geneId" = "gene_id"))
#WT_KO_all
plotDistToTSS(WT_KO,
title="Distribution of transcription factor-binding loci\nrelative to TSS")
cat('\n\n')
Below we have separated significant peaks by direction of change and report a few of the statistic columns.
There is only one region found which is significantly different in WT vs KO.
library(tidyverse)
sigUp <- WT_KO_all %>%
dplyr::filter(threshold == "TRUE" & log2FoldChange > 0) %>%
arrange(padj) %>%
mutate_if(is.numeric, round, digits = 7)
sanitize_datatable(sigUp[,c("seqnames", "start", "end", "SYMBOL" , "annotation", "log2FoldChange", "pvalue", "padj","gene_name")])
cat('\n\n')
I removed the H460 cell-line from the set.
meta_H9 <- read.csv("../meta/metadata_rm2.csv")
meta_H9 <- meta_H9 %>%
column_to_rownames(var="sample")
counts_H9 <- counts_con[,rownames(meta_H9)]
#all
dds_all <- DESeqDataSetFromMatrix(counts_H9, meta_H9, design=~cellline + Factor)
dds_all <- DESeq(dds_all)
cat('\n\n')
#meta
From the PCA we observe a samples from same celllines are close which associates with the sample groups.
# Matrix of transformed counts for downstream visualization
rld1 <- rlog(dds_all, blind = TRUE)
# Compute principal components
pc <- prcomp(t(assay(rld1)))
plot_pca <- data.frame(pc$x, colData(dds_all))
# Plot with sample names used as data points
ggplot(plot_pca, aes(x = PC1, y = PC2, label = rownames(plot_pca))) +
geom_point(aes(color = cellline,shape = Factor), size = 3) +
geom_text_repel() +
xlab('PC1') + ylab('PC2') +
theme_bw() +
theme(axis.title = element_text(size = rel(1.25)),
axis.text = element_text(size = rel(1.15)),
plot.title = element_text(size = rel(1.5), hjust = 0.5),
legend.title = element_blank()) +
scale_color_manual(values=viridis_col[c(1,20)])
cat('\n\n')
cat('\n\n')
plotDispEsts(dds_all)
# filter
keep <- rowSums(counts(dds_all) >= 10) >= 3
dds_all <- dds_all[keep,]
dds_all <- DESeq(dds_all)
# plot again
plotDispEsts(dds_all)
cat('\n\n')
The MA plot explores the mean expression level of the genes with the fold change, highlighting the regions that are differentially enriched (padj < 0.05) using blue colored data points.
# Get results
res_all <- results(dds_all,contrast=c("Factor","WT","KO"))
resultsNames(dds_all)
## [1] "Intercept" "cellline_H2009_vs_Calu6"
## [3] "Factor_WT_vs_KO"
res_all <- lfcShrink(dds_all, coef="Factor_WT_vs_KO", type="apeglm")
plotMA(res_all, alpha = 0.5)
#res_all <- sort(res_all@listData[["padj"]])
cat('\n\n')
At a padj < 0.05, we find:
Here, we plot the log2 fold change of each region against the log10 adjusted p-value. The points highlighted in teal are regions that have padj < 0.05.
The contrast is setup such that positive logFC means that there is an increased enrichment of a region in WT compared to KO.
nrow(res_all[which(res_all$padj < 0.05),])
## [1] 2
# Create dataframe
res_all <- res_all %>%
as.data.frame() %>%
mutate(threshold = padj < 0.05) %>%
arrange(padj)
res_all_filt <- res_all %>%
as.data.frame() %>%
mutate(threshold = padj < 0.05) %>%
arrange(padj) %>% filter(threshold == "TRUE")
## Volcano plot
p1 <- ggplot(res_all) +
geom_point(aes(log2FoldChange, y = -log10(padj), colour = threshold)) +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value") +
ggtitle("WT vs KO") +
theme_bw() +
theme(
legend.position = "none",
plot.title = element_text(size = rel(2.0), hjust = 0.5),
axis.title = element_text(size = rel(1.25))
)
p1
cat('\n\n')
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene #shorthand (for convenience)
# Make TxDb
#txdb
# Create GRanges
gr_all <- data.frame(rows = rownames(res_all_filt)) %>%
separate(rows, c("chr", "start", "end")) %>%
cbind(res_all_filt) %>%
remove_rownames() %>%
makeGRangesFromDataFrame(keep.extra.columns=TRUE,
ignore.strand=TRUE,
seqnames.field=c("chr"),
start.field="start",
end.field=c("end"))
# Annotate
WT_KO <- annotatePeak(gr_all, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db")
## >> preparing features information... 2022-04-12 21:18:11
## >> identifying nearest features... 2022-04-12 21:18:11
## >> calculating distance from peak to TSS... 2022-04-12 21:18:11
## >> assigning genomic annotation... 2022-04-12 21:18:11
## >> adding gene annotation... 2022-04-12 21:18:13
## >> assigning chromosome lengths 2022-04-12 21:18:13
## >> done... 2022-04-12 21:18:13
plotAnnoPie(WT_KO)
#plotAnnoBar(WT_KO)
#upsetplot(WT_KO)
#vennpie(WT_KO)
library(ggupset)
library(ggimage)
#upsetplot(WT_KO, vennpie=TRUE)
# Query AnnotationHub to get gene symbols
ah <- AnnotationHub()
# Query AnnotationHub
human_ens <- query(ah, c("Homo sapiens", "EnsDb"))
# Extract annotations of interest
human_ens <- human_ens[["AH75011"]]
# Extract gene-level information
#genes(human_ens, return.type = "data.frame") %>% head()
# Create a gene-level dataframe
annotations_ahb <- genes(human_ens, return.type = "data.frame") %>%
dplyr::select(gene_name,gene_id, entrezid, gene_biotype,symbol)
# Keep one entrez Id per gene
#annotations_ahb$entrezid <- map(annotations_ahb$entrezid,1) %>% unlist()
# Combine annotations with results
WT_KO_all <- WT_KO@anno %>%
data.frame() %>%
dplyr::select(-geneChr, -geneStart, -geneEnd, -geneLength, -geneStrand) %>%
left_join(annotations_ahb, by=c("geneId" = "gene_id"))
#WT_KO_all
plotDistToTSS(WT_KO,
title="Distribution of transcription factor-binding loci\nrelative to TSS")
## Error in seq.default(-uslim, dslim, by = 10): 'from' must be a finite number
cat('\n\n')
Below we have separated significant peaks by direction of change and report a few of the statistic columns.
There is only one region found which is significantly different in WT vs KO.
library(tidyverse)
sigUp <- WT_KO_all %>%
dplyr::filter(threshold == "TRUE" & log2FoldChange > 0) %>%
arrange(padj) %>%
mutate_if(is.numeric, round, digits = 7)
sanitize_datatable(sigUp[,c("seqnames", "start", "end", "SYMBOL" , "annotation", "log2FoldChange", "pvalue", "padj","gene_name")])
cat('\n\n')
Below is the output of the R session used to generate this report. Included is information on R version, OS, and versions of packages installed and used.
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggimage_0.3.0
## [2] ggupset_0.3.0
## [3] reactome.db_1.77.0
## [4] GO.db_3.14.0
## [5] patchwork_1.1.1
## [6] org.Hs.eg.db_3.14.0
## [7] EnsDb.Hsapiens.v75_2.99.0
## [8] clusterProfiler_4.2.2
## [9] forcats_0.5.1
## [10] stringr_1.4.0
## [11] dplyr_1.0.8
## [12] purrr_0.3.4
## [13] readr_2.1.2
## [14] tidyr_1.2.0
## [15] tibble_3.1.6
## [16] tidyverse_1.3.1
## [17] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [18] rtracklayer_1.54.0
## [19] AnnotationHub_3.2.2
## [20] BiocFileCache_2.2.1
## [21] dbplyr_2.1.1
## [22] EnsDb.Hsapiens.v86_2.99.0
## [23] ensembldb_2.18.4
## [24] AnnotationFilter_1.18.0
## [25] GenomicFeatures_1.46.5
## [26] AnnotationDbi_1.56.2
## [27] ChIPseeker_1.30.3
## [28] ChIPpeakAnno_3.28.1
## [29] DiffBind_3.4.11
## [30] DESeq2_1.34.0
## [31] SummarizedExperiment_1.24.0
## [32] Biobase_2.54.0
## [33] MatrixGenerics_1.6.0
## [34] matrixStats_0.61.0
## [35] GenomicRanges_1.46.1
## [36] GenomeInfoDb_1.30.1
## [37] IRanges_2.28.0
## [38] S4Vectors_0.32.4
## [39] BiocGenerics_0.40.0
## [40] knitr_1.38
## [41] viridis_0.6.2
## [42] viridisLite_0.4.0
## [43] ggrepel_0.9.1
## [44] ggplot2_3.3.5
## [45] DT_0.22
## [46] RColorBrewer_1.1-3
## [47] pheatmap_1.0.12
## [48] reshape_0.8.9
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 coda_0.19-4
## [3] bit64_4.0.5 irlba_2.3.5
## [5] DelayedArray_0.20.0 data.table_1.14.2
## [7] hwriter_1.3.2.1 KEGGREST_1.34.0
## [9] RCurl_1.98-1.6 generics_0.1.2
## [11] lambda.r_1.2.4 RSQLite_2.2.12
## [13] shadowtext_0.1.1 tzdb_0.3.0
## [15] bit_4.0.4 enrichplot_1.14.2
## [17] xml2_1.3.3 lubridate_1.8.0
## [19] httpuv_1.6.5 assertthat_0.2.1
## [21] amap_0.8-18 apeglm_1.16.0
## [23] xfun_0.30 hms_1.1.1
## [25] jquerylib_0.1.4 evaluate_0.15
## [27] promises_1.2.0.1 fansi_1.0.3
## [29] restfulr_0.0.13 progress_1.2.2
## [31] readxl_1.4.0 caTools_1.18.2
## [33] igraph_1.3.0 DBI_1.1.2
## [35] geneplotter_1.72.0 htmlwidgets_1.5.4
## [37] futile.logger_1.4.3 ellipsis_0.3.2
## [39] crosstalk_1.2.0 backports_1.4.1
## [41] annotate_1.72.0 biomaRt_2.50.3
## [43] vctrs_0.4.0 cachem_1.0.6
## [45] withr_2.5.0 ggforce_0.3.3
## [47] BSgenome_1.62.0 bdsmatrix_1.3-4
## [49] GenomicAlignments_1.30.0 treeio_1.18.1
## [51] prettyunits_1.1.1 DOSE_3.20.1
## [53] ape_5.6-2 lazyeval_0.2.2
## [55] crayon_1.5.1 genefilter_1.76.0
## [57] labeling_0.4.2 pkgconfig_2.0.3
## [59] tweenr_1.0.2 nlme_3.1-157
## [61] ProtGenerics_1.26.0 rlang_1.0.2
## [63] lifecycle_1.0.1 downloader_0.4
## [65] filelock_1.0.2 modelr_0.1.8
## [67] VennDiagram_1.7.1 invgamma_1.1
## [69] cellranger_1.1.0 polyclip_1.10-0
## [71] graph_1.72.0 Matrix_1.4-1
## [73] aplot_0.1.3 ashr_2.2-54
## [75] boot_1.3-28 reprex_2.0.1
## [77] png_0.1-7 rjson_0.2.21
## [79] bitops_1.0-7 KernSmooth_2.23-20
## [81] Biostrings_2.62.0 blob_1.2.3
## [83] mixsqp_0.3-43 SQUAREM_2021.1
## [85] qvalue_2.26.0 regioneR_1.26.1
## [87] ShortRead_1.52.0 jpeg_0.1-9
## [89] gridGraphics_0.5-1 scales_1.1.1
## [91] memoise_2.0.1 magrittr_2.0.3
## [93] plyr_1.8.7 gplots_3.1.1
## [95] zlibbioc_1.40.0 compiler_4.1.3
## [97] scatterpie_0.1.7 BiocIO_1.4.0
## [99] bbmle_1.0.24 plotrix_3.8-2
## [101] Rsamtools_2.10.0 cli_3.2.0
## [103] systemPipeR_2.0.6 XVector_0.34.0
## [105] formatR_1.12 MASS_7.3-56
## [107] tidyselect_1.1.2 stringi_1.7.6
## [109] highr_0.9 emdbook_1.3.12
## [111] yaml_2.3.5 GOSemSim_2.20.0
## [113] locfit_1.5-9.5 latticeExtra_0.6-29
## [115] grid_4.1.3 sass_0.4.1
## [117] fastmatch_1.1-3 tools_4.1.3
## [119] parallel_4.1.3 rstudioapi_0.13
## [121] gridExtra_2.3 farver_2.1.0
## [123] ggraph_2.0.5 digest_0.6.29
## [125] BiocManager_1.30.16 shiny_1.7.1
## [127] Rcpp_1.0.8.3 broom_0.7.12
## [129] BiocVersion_3.14.0 later_1.3.0
## [131] httr_1.4.2 colorspace_2.0-3
## [133] rvest_1.0.2 fs_1.5.2
## [135] XML_3.99-0.9 truncnorm_1.0-8
## [137] splines_4.1.3 yulab.utils_0.0.4
## [139] RBGL_1.70.0 tidytree_0.3.9
## [141] graphlayouts_0.8.0 multtest_2.50.0
## [143] ggplotify_0.1.0 xtable_1.8-4
## [145] jsonlite_1.8.0 ggtree_3.2.1
## [147] futile.options_1.0.1 tidygraph_1.2.1
## [149] ggfun_0.0.6 R6_2.5.1
## [151] pillar_1.7.0 htmltools_0.5.2
## [153] mime_0.12 glue_1.6.2
## [155] fastmap_1.1.0 BiocParallel_1.28.3
## [157] interactiveDisplayBase_1.32.0 fgsea_1.20.0
## [159] GreyListChIP_1.26.0 mvtnorm_1.1-3
## [161] utf8_1.2.2 lattice_0.20-45
## [163] bslib_0.3.1 numDeriv_2016.8-1.1
## [165] curl_4.3.2 gtools_3.9.2
## [167] magick_2.7.3 survival_3.3-1
## [169] limma_3.50.3 rmarkdown_2.13
## [171] InteractionSet_1.22.0 munsell_0.5.0
## [173] DO.db_2.9 GenomeInfoDbData_1.2.7
## [175] haven_2.4.3 reshape2_1.4.4
## [177] gtable_0.3.0